Store Sales Prediction with Linear Regression

Import Libraries

In [1]:
# Basic Python libraries
import pandas as pd
import numpy as np

# Date handling
from datetime import datetime, timedelta

# Plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Statistics
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Machine learning libraries
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Settings
pd.set_option('display.max_columns', None)
np.set_printoptions(threshold=np.inf, precision=3)
sns.set(style="darkgrid")
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

Import Data from CSV Files

In [2]:
# Import the training data set
train = pd.read_csv("train.csv")

# Import the test data set
test = pd.read_csv("test.csv")

# Import the store data set
stores = pd.read_csv("stores.csv")

# Import the features data set
feature = pd.read_csv("features.csv")

Merge the data sets:

  - train + stores + features
  - test + stores + features
In [3]:
# For the train data set
train_bt = pd.merge(train, stores)
train = pd.merge(train_bt, feature)

# For the test data set
test_bt = pd.merge(test, stores)
test = pd.merge(test_bt, feature)
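
With no key specified, pd.merge joins on all column names the frames share (Store for the stores table; Store, Date and IsHoliday for the features table). If you prefer the keys spelled out, a minimal sketch equivalent to the cell above (an alternative, not a follow-up step) could look like this:

In [ ]:
# Sketch: explicit-key version of the merges above (do not run in addition to the cell above)
train = pd.merge(train, stores, on='Store', how='inner')
train = pd.merge(train, feature, on=['Store', 'Date', 'IsHoliday'], how='inner')

test = pd.merge(test, stores, on='Store', how='inner')
test = pd.merge(test, feature, on=['Store', 'Date', 'IsHoliday'], how='inner')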
In [4]:
train.head(2)
Out[4]:
Store Dept Date Weekly_Sales IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment
0 1 1 2010-02-05 24924.50 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
1 1 2 2010-02-05 50605.27 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
In [5]:
test.head(2)
Out[5]:
Store Dept Date IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment
0 1 1 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573
1 1 2 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573
In [6]:
print (train.info())
print ("*****************************************")
print (test.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 421570 entries, 0 to 421569
Data columns (total 16 columns):
Store           421570 non-null int64
Dept            421570 non-null int64
Date            421570 non-null object
Weekly_Sales    421570 non-null float64
IsHoliday       421570 non-null bool
Type            421570 non-null object
Size            421570 non-null int64
Temperature     421570 non-null float64
Fuel_Price      421570 non-null float64
MarkDown1       150681 non-null float64
MarkDown2       111248 non-null float64
MarkDown3       137091 non-null float64
MarkDown4       134967 non-null float64
MarkDown5       151432 non-null float64
CPI             421570 non-null float64
Unemployment    421570 non-null float64
dtypes: bool(1), float64(10), int64(3), object(2)
memory usage: 51.9+ MB
None
*****************************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 115064 entries, 0 to 115063
Data columns (total 15 columns):
Store           115064 non-null int64
Dept            115064 non-null int64
Date            115064 non-null object
IsHoliday       115064 non-null bool
Type            115064 non-null object
Size            115064 non-null int64
Temperature     115064 non-null float64
Fuel_Price      115064 non-null float64
MarkDown1       114915 non-null float64
MarkDown2       86437 non-null float64
MarkDown3       105235 non-null float64
MarkDown4       102176 non-null float64
MarkDown5       115064 non-null float64
CPI             76902 non-null float64
Unemployment    76902 non-null float64
dtypes: bool(1), float64(9), int64(3), object(2)
memory usage: 13.3+ MB
None

Select only positive weekly sales

In [7]:
# Keep only rows where weekly sales are positive.
train = train[train['Weekly_Sales']>0]

Data Description:

1. Training Data

In [8]:
numeric_var_train=[key for key in dict(train.dtypes) if dict(train.dtypes)[key] in ['float64', 'int64', 'float32', 'int32']]
cat_var_train=[key for key in dict(train.dtypes) if dict(train.dtypes)[key] in ['object']]

# Train Numerical Data
train_num=train[numeric_var_train]

# Train Categorical Data
train_cat=train[cat_var_train]

print (numeric_var_train)
print (cat_var_train)
['Store', 'Dept', 'Weekly_Sales', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']
['Date', 'Type']
In [12]:
# Use a general function that returns multiple values
def var_summary(x):
    return pd.Series(
        [x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(), x.std(), x.var(), x.min(),
         x.dropna().quantile(0.01), x.dropna().quantile(0.05), x.dropna().quantile(0.10),
         x.dropna().quantile(0.25), x.dropna().quantile(0.50), x.dropna().quantile(0.75),
         x.dropna().quantile(0.90), x.dropna().quantile(0.95), x.dropna().quantile(0.99), x.max()],
        index=['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN',
               'P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90', 'P95', 'P99', 'MAX'])
In [ ]:
num_summary=train_num.apply(lambda x: var_summary(x)).T
num_summary
In [9]:
def cat_summary(x):
    return pd.Series([x.count(), x.isnull().sum(), x.value_counts()], 
                  index=['N', 'NMISS', 'ColumnsNames'])

cat_summary=train_cat.apply(lambda x: cat_summary(x))
cat_summary
Out[9]:
Date Type
N 420212 420212
NMISS 0 0
ColumnsNames 2011-12-23 3018 2011-11-25 3016 2011-12-... A 214961 B 162787 C 42464 Name: Type...

2. Testing Data

In [10]:
numeric_var_test=[key for key in dict(test.dtypes) if dict(test.dtypes)[key] in ['float64', 'int64', 'float32', 'int32']]
cat_var_test=[key for key in dict(test.dtypes) if dict(test.dtypes)[key] in ['object']]

# Test numerical data
test_num = test[numeric_var_test]

# Test categorical data
test_cat = test[cat_var_test]

print (numeric_var_test)
print (cat_var_test)
['Store', 'Dept', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']
['Date', 'Type']
In [13]:
# Numerical data summary report
num_summary=test_num.apply(lambda x: var_summary(x)).T

num_summary.head()
Out[13]:
N NMISS SUM MEAN MEDIAN STD VAR MIN P1 P5 P10 P25 P50 P75 P90 P95 P99 MAX
Store 115064.0 0.0 2.558817e+06 22.238207 22.000 12.809930 1.640943e+02 1.000 1.000 3.000 5.000 11.000 22.000 33.000 40.000 43.000 45.000 45.000
Dept 115064.0 0.0 5.101883e+06 44.339524 37.000 30.656410 9.398155e+02 1.000 1.000 4.000 7.000 18.000 37.000 74.000 92.000 95.000 98.000 99.000
Size 115064.0 0.0 1.570597e+10 136497.688921 140167.000 61106.926438 3.734056e+09 34875.000 34875.000 39690.000 39910.000 93638.000 140167.000 202505.000 204184.000 206302.000 219622.000 219622.000
Temperature 115064.0 0.0 6.206760e+06 53.941804 54.470 18.724153 3.505939e+02 -7.290 11.440 23.980 29.970 39.820 54.470 67.350 79.480 83.820 92.140 101.950
Fuel_Price 115064.0 0.0 4.121070e+05 3.581546 3.606 0.239442 5.733244e-02 2.872 2.957 3.161 3.227 3.431 3.606 3.766 3.866 3.951 4.079 4.125
In [19]:
# categorical data summary report
def cat_summary(x):
    return pd.Series([x.count(), x.isnull().sum(), x.value_counts()], 
                  index=['N', 'NMISS', 'ColumnsNames'])

cat_summary=test_cat.apply(lambda x: cat_summary(x))
cat_summary
Out[19]:
Date Type
N 115064 115064
NMISS 0 0
ColumnsNames 2012-12-21 3002 2012-12-07 2989 2012-12-... A 58713 B 44500 C 11851 Name: Type, d...
In [21]:
# Run pandas profiling to see the overall report
import pandas_profiling
pandas_profiling.ProfileReport(train)
Out[21]:

Overview

Dataset info

Number of variables 16
Number of observations 421570
Total Missing (%) 21.1%
Total size in memory 51.9 MiB
Average record size in memory 129.0 B

Variables types

Numeric 13
Categorical 2
Boolean 1
Date 0
Text (Unique) 0
Rejected 0
Unsupported 0

Warnings

  • Date has a high cardinality: 143 distinct values Warning
  • MarkDown1 has 270889 / 64.3% missing values Missing
  • MarkDown2 has 310322 / 73.6% missing values Missing
  • MarkDown3 has 284479 / 67.5% missing values Missing
  • MarkDown4 has 286603 / 68.0% missing values Missing
  • MarkDown5 has 270138 / 64.1% missing values Missing

Variables

CPI
Numeric

Distinct count 2145
Unique (%) 0.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 171.2
Minimum 126.06
Maximum 227.23
Zeros (%) 0.0%

Quantile statistics

Minimum 126.06
5-th percentile 126.5
Q1 132.02
Median 182.32
Q3 212.42
95-th percentile 221.94
Maximum 227.23
Range 101.17
Interquartile range 80.394

Descriptive statistics

Standard deviation 39.159
Coef of variation 0.22873
Kurtosis -1.8297
Mean 171.2
MAD 38.066
Skewness 0.085219
Sum 72174000
Variance 1533.4
Memory size 6.4 MiB
Value Count Frequency (%)  
129.8555333 711 0.2%
 
131.1083333 708 0.2%
 
129.84596670000002 707 0.2%
 
130.38490320000002 706 0.2%
 
130.683 706 0.2%
 
131.0756667 706 0.2%
 
130.6457931 706 0.2%
 
130.7196333 705 0.2%
 
130.4546207 705 0.2%
 
129.98454840000002 704 0.2%
 
Other values (2135) 414506 98.3%
 

Minimum 5 values

Value Count Frequency (%)  
126.064 678 0.2%
 
126.0766452 679 0.2%
 
126.08545159999998 675 0.2%
 
126.08929029999999 682 0.2%
 
126.1019355 686 0.2%
 

Maximum 5 values

Value Count Frequency (%)  
227.01841659999997 69 0.0%
 
227.0369359 70 0.0%
 
227.16939190000002 63 0.0%
 
227.21428799999998 62 0.0%
 
227.2328068 63 0.0%
 

Date
Categorical

Distinct count 143
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
2011-12-23
 
3027
2011-11-25
 
3021
2011-12-16
 
3013
Other values (140)
412509
Value Count Frequency (%)  
2011-12-23 3027 0.7%
 
2011-11-25 3021 0.7%
 
2011-12-16 3013 0.7%
 
2011-12-09 3010 0.7%
 
2012-02-17 3007 0.7%
 
2011-12-30 3003 0.7%
 
2012-02-10 3001 0.7%
 
2011-12-02 2994 0.7%
 
2012-03-02 2990 0.7%
 
2012-10-12 2990 0.7%
 
Other values (133) 391514 92.9%
 

Dept
Numeric

Distinct count 81
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 44.26
Minimum 1
Maximum 99
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 4
Q1 18
Median 37
Q3 74
95-th percentile 95
Maximum 99
Range 98
Interquartile range 56

Descriptive statistics

Standard deviation 30.492
Coef of variation 0.68893
Kurtosis -1.2156
Mean 44.26
MAD 26.537
Skewness 0.35822
Sum 18658822
Variance 929.77
Memory size 6.4 MiB
Value Count Frequency (%)  
1 6435 1.5%
 
10 6435 1.5%
 
38 6435 1.5%
 
21 6435 1.5%
 
67 6435 1.5%
 
16 6435 1.5%
 
14 6435 1.5%
 
13 6435 1.5%
 
79 6435 1.5%
 
81 6435 1.5%
 
Other values (71) 357220 84.7%
 

Minimum 5 values

Value Count Frequency (%)  
1 6435 1.5%
 
2 6435 1.5%
 
3 6435 1.5%
 
4 6435 1.5%
 
5 6347 1.5%
 

Maximum 5 values

Value Count Frequency (%)  
95 6435 1.5%
 
96 4854 1.2%
 
97 6278 1.5%
 
98 5836 1.4%
 
99 862 0.2%
 

Fuel_Price
Numeric

Distinct count 892
Unique (%) 0.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.361
Minimum 2.472
Maximum 4.468
Zeros (%) 0.0%

Quantile statistics

Minimum 2.472
5-th percentile 2.653
Q1 2.933
Median 3.452
Q3 3.738
95-th percentile 4.029
Maximum 4.468
Range 1.996
Interquartile range 0.805

Descriptive statistics

Standard deviation 0.45851
Coef of variation 0.13642
Kurtosis -1.1854
Mean 3.361
MAD 0.4032
Skewness -0.1049
Sum 1416900
Variance 0.21024
Memory size 6.4 MiB
Value Count Frequency (%)  
3.638 2548 0.6%
 
3.63 2164 0.5%
 
2.7710000000000004 1917 0.5%
 
3.891 1856 0.4%
 
3.594 1796 0.4%
 
3.5239999999999996 1793 0.4%
 
3.523 1792 0.4%
 
2.72 1790 0.4%
 
3.6660000000000004 1778 0.4%
 
2.78 1656 0.4%
 
Other values (882) 402480 95.5%
 

Minimum 5 values

Value Count Frequency (%)  
2.472 38 0.0%
 
2.513 45 0.0%
 
2.5140000000000002 906 0.2%
 
2.52 39 0.0%
 
2.533 42 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
4.294 363 0.1%
 
4.301 360 0.1%
 
4.308 168 0.0%
 
4.449 358 0.1%
 
4.468 368 0.1%
 

IsHoliday
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.070358
True
 
29661
(Missing)
391909
Value Count Frequency (%)  
True 29661 7.0%
 
(Missing) 391909 93.0%
 

MarkDown1
Numeric

Distinct count 2278
Unique (%) 0.5%
Missing (%) 64.3%
Missing (n) 270889
Infinite (%) 0.0%
Infinite (n) 0
Mean 7246.4
Minimum 0.27
Maximum 88647
Zeros (%) 0.0%

Quantile statistics

Minimum 0.27
5-th percentile 149.19
Q1 2240.3
Median 5347.4
Q3 9210.9
95-th percentile 21801
Maximum 88647
Range 88646
Interquartile range 6970.6

Descriptive statistics

Standard deviation 8291.2
Coef of variation 1.1442
Kurtosis 17.606
Mean 7246.4
MAD 5262.8
Skewness 3.3418
Sum 1091900000
Variance 68744000
Memory size 6.4 MiB
Value Count Frequency (%)  
1.5 102 0.0%
 
460.73 102 0.0%
 
175.64 93 0.0%
 
1282.42 75 0.0%
 
9264.48 75 0.0%
 
686.24 75 0.0%
 
5924.71 75 0.0%
 
1483.17 75 0.0%
 
3242.59 74 0.0%
 
10671.71 74 0.0%
 
Other values (2267) 149861 35.5%
 
(Missing) 270889 64.3%
 

Minimum 5 values

Value Count Frequency (%)  
0.27 51 0.0%
 
0.5 49 0.0%
 
1.5 102 0.0%
 
1.94 50 0.0%
 
2.12 52 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
62567.6 66 0.0%
 
65021.23 73 0.0%
 
75149.79 73 0.0%
 
78124.5 70 0.0%
 
88646.76 68 0.0%
 

MarkDown2
Numeric

Distinct count 1500
Unique (%) 0.4%
Missing (%) 73.6%
Missing (n) 310322
Infinite (%) 0.0%
Infinite (n) 0
Mean 3334.6
Minimum -265.76
Maximum 104520
Zeros (%) 0.0%

Quantile statistics

Minimum -265.76
5-th percentile 1.95
Q1 41.6
Median 192
Q3 1926.9
95-th percentile 16497
Maximum 104520
Range 104790
Interquartile range 1885.3

Descriptive statistics

Standard deviation 9475.4
Coef of variation 2.8415
Kurtosis 37.59
Mean 3334.6
MAD 4690.4
Skewness 5.4413
Sum 370970000
Variance 89782000
Memory size 6.4 MiB
Value Count Frequency (%)  
1.91 539 0.1%
 
3.0 493 0.1%
 
0.5 485 0.1%
 
1.5 471 0.1%
 
4.0 367 0.1%
 
6.0 365 0.1%
 
7.64 354 0.1%
 
3.82 353 0.1%
 
5.73 345 0.1%
 
19.0 345 0.1%
 
Other values (1489) 107131 25.4%
 
(Missing) 310322 73.6%
 

Minimum 5 values

Value Count Frequency (%)  
-265.76 71 0.0%
 
-192.0 72 0.0%
 
-20.0 72 0.0%
 
-10.98 60 0.0%
 
-10.5 143 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
82881.16 73 0.0%
 
89121.94 74 0.0%
 
92523.94 73 0.0%
 
97740.99 73 0.0%
 
104519.54 72 0.0%
 

MarkDown3
Numeric

Distinct count 1663
Unique (%) 0.4%
Missing (%) 67.5%
Missing (n) 284479
Infinite (%) 0.0%
Infinite (n) 0
Mean 1439.4
Minimum -29.1
Maximum 141630
Zeros (%) 0.0%

Quantile statistics

Minimum -29.1
5-th percentile 0.65
Q1 5.08
Median 24.6
Q3 103.99
95-th percentile 1059.9
Maximum 141630
Range 141660
Interquartile range 98.91

Descriptive statistics

Standard deviation 9623.1
Coef of variation 6.6854
Kurtosis 77.688
Mean 1439.4
MAD 2578.1
Skewness 8.3995
Sum 197330000
Variance 92604000
Memory size 6.4 MiB
Value Count Frequency (%)  
3.0 754 0.2%
 
6.0 710 0.2%
 
2.0 660 0.2%
 
1.0 611 0.1%
 
0.22 487 0.1%
 
0.5 463 0.1%
 
0.01 444 0.1%
 
4.0 439 0.1%
 
3.2 379 0.1%
 
1.98 363 0.1%
 
Other values (1652) 131781 31.3%
 
(Missing) 284479 67.5%
 

Minimum 5 values

Value Count Frequency (%)  
-29.1 72 0.0%
 
-1.0 70 0.0%
 
-0.87 46 0.0%
 
-0.2 69 0.0%
 
0.0 67 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
89402.64 71 0.0%
 
101378.79 73 0.0%
 
103991.94 72 0.0%
 
109030.75 75 0.0%
 
141630.61 74 0.0%
 

MarkDown4
Numeric

Distinct count 1945
Unique (%) 0.5%
Missing (%) 68.0%
Missing (n) 286603
Infinite (%) 0.0%
Infinite (n) 0
Mean 3383.2
Minimum 0.22
Maximum 67475
Zeros (%) 0.0%

Quantile statistics

Minimum 0.22
5-th percentile 28.76
Q1 504.22
Median 1481.3
Q3 3595
95-th percentile 12646
Maximum 67475
Range 67475
Interquartile range 3090.8

Descriptive statistics

Standard deviation 6292.4
Coef of variation 1.8599
Kurtosis 29.997
Mean 3383.2
MAD 3329.7
Skewness 4.8475
Sum 456620000
Variance 39594000
Memory size 6.4 MiB
Value Count Frequency (%)  
9.0 280 0.1%
 
4.0 200 0.0%
 
2.0 197 0.0%
 
3.0 146 0.0%
 
47.0 143 0.0%
 
67.72 142 0.0%
 
17.0 141 0.0%
 
657.56 141 0.0%
 
8.0 140 0.0%
 
1330.36 140 0.0%
 
Other values (1934) 133297 31.6%
 
(Missing) 286603 68.0%
 

Minimum 5 values

Value Count Frequency (%)  
0.22 57 0.0%
 
0.41 52 0.0%
 
0.46 48 0.0%
 
0.78 52 0.0%
 
0.87 49 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
52739.02 72 0.0%
 
53603.99 72 0.0%
 
57815.43 68 0.0%
 
57817.56 74 0.0%
 
67474.85 72 0.0%
 

MarkDown5
Numeric

Distinct count 2294
Unique (%) 0.5%
Missing (%) 64.1%
Missing (n) 270138
Infinite (%) 0.0%
Infinite (n) 0
Mean 4629
Minimum 135.16
Maximum 108520
Zeros (%) 0.0%

Quantile statistics

Minimum 135.16
5-th percentile 715.52
Q1 1878.4
Median 3359.4
Q3 5563.8
95-th percentile 11269
Maximum 108520
Range 108380
Interquartile range 3685.4

Descriptive statistics

Standard deviation 5962.9
Coef of variation 1.2882
Kurtosis 107.85
Mean 4629
MAD 2989.8
Skewness 8.1699
Sum 700970000
Variance 35556000
Memory size 6.4 MiB
Value Count Frequency (%)  
2743.18 136 0.0%
 
1064.56 120 0.0%
 
9083.54 75 0.0%
 
20371.02 75 0.0%
 
3567.03 75 0.0%
 
4180.29 75 0.0%
 
3557.67 75 0.0%
 
986.23 74 0.0%
 
1773.53 74 0.0%
 
14660.97 74 0.0%
 
Other values (2283) 150579 35.7%
 
(Missing) 270138 64.1%
 

Minimum 5 values

Value Count Frequency (%)  
135.16 65 0.0%
 
153.04 47 0.0%
 
153.9 49 0.0%
 
164.08 52 0.0%
 
170.64 69 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
58068.14 69 0.0%
 
63005.58 69 0.0%
 
85851.87 68 0.0%
 
105223.11 70 0.0%
 
108519.28 68 0.0%
 

Size
Numeric

Distinct count 40
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 136730
Minimum 34875
Maximum 219622
Zeros (%) 0.0%

Quantile statistics

Minimum 34875
5-th percentile 39690
Q1 93638
Median 140170
Q3 202500
95-th percentile 206300
Maximum 219622
Range 184747
Interquartile range 108870

Descriptive statistics

Standard deviation 60981
Coef of variation 0.446
Kurtosis -1.2063
Mean 136730
MAD 52517
Skewness -0.32585
Sum 57640387438
Variance 3718600000
Memory size 6.4 MiB
Value Count Frequency (%)  
39690 20802 4.9%
 
39910 20597 4.9%
 
203819 20376 4.8%
 
219622 10474 2.5%
 
126512 10315 2.4%
 
205863 10272 2.4%
 
151315 10244 2.4%
 
202307 10238 2.4%
 
204184 10225 2.4%
 
158114 10224 2.4%
 
Other values (30) 287803 68.3%
 

Minimum 5 values

Value Count Frequency (%)  
34875 8999 2.1%
 
37392 9036 2.1%
 
39690 20802 4.9%
 
39910 20597 4.9%
 
41062 6751 1.6%
 

Maximum 5 values

Value Count Frequency (%)  
204184 10225 2.4%
 
205863 10272 2.4%
 
206302 10113 2.4%
 
207499 10062 2.4%
 
219622 10474 2.5%
 

Store
Numeric

Distinct count 45
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 22.201
Minimum 1
Maximum 45
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 3
Q1 11
Median 22
Q3 33
95-th percentile 43
Maximum 45
Range 44
Interquartile range 22

Descriptive statistics

Standard deviation 12.785
Coef of variation 0.5759
Kurtosis -1.1465
Mean 22.201
MAD 10.996
Skewness 0.077763
Sum 9359084
Variance 163.46
Memory size 6.4 MiB
Value Count Frequency (%)  
13 10474 2.5%
 
10 10315 2.4%
 
4 10272 2.4%
 
1 10244 2.4%
 
2 10238 2.4%
 
24 10228 2.4%
 
27 10225 2.4%
 
34 10224 2.4%
 
20 10214 2.4%
 
6 10211 2.4%
 
Other values (35) 318925 75.7%
 

Minimum 5 values

Value Count Frequency (%)  
1 10244 2.4%
 
2 10238 2.4%
 
3 9036 2.1%
 
4 10272 2.4%
 
5 8999 2.1%
 

Maximum 5 values

Value Count Frequency (%)  
41 10088 2.4%
 
42 6953 1.6%
 
43 6751 1.6%
 
44 7169 1.7%
 
45 9637 2.3%
 

Temperature
Numeric

Distinct count 3528
Unique (%) 0.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 60.09
Minimum -2.06
Maximum 100.14
Zeros (%) 0.0%

Quantile statistics

Minimum -2.06
5-th percentile 27.31
Q1 46.68
Median 62.09
Q3 74.28
95-th percentile 87.27
Maximum 100.14
Range 102.2
Interquartile range 27.6

Descriptive statistics

Standard deviation 18.448
Coef of variation 0.307
Kurtosis -0.63592
Mean 60.09
MAD 15.377
Skewness -0.3214
Sum 25332000
Variance 340.33
Memory size 6.4 MiB
Value Count Frequency (%)  
50.43 709 0.2%
 
67.87 646 0.2%
 
72.62 594 0.1%
 
76.67 583 0.1%
 
70.28 563 0.1%
 
76.03 555 0.1%
 
50.56 544 0.1%
 
64.05 542 0.1%
 
64.21 519 0.1%
 
50.81 487 0.1%
 
Other values (3518) 415828 98.6%
 

Minimum 5 values

Value Count Frequency (%)  
-2.06 69 0.0%
 
5.54 68 0.0%
 
6.23 69 0.0%
 
7.46 69 0.0%
 
9.51 70 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
99.2 46 0.0%
 
99.22 185 0.0%
 
99.66 48 0.0%
 
100.07 46 0.0%
 
100.14 44 0.0%
 

Type
Categorical

Distinct count 3
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
A
215478
B
163495
C
 
42597
Value Count Frequency (%)  
A 215478 51.1%
 
B 163495 38.8%
 
C 42597 10.1%
 

Unemployment
Numeric

Distinct count 349
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 7.9603
Minimum 3.879
Maximum 14.313
Zeros (%) 0.0%

Quantile statistics

Minimum 3.879
5-th percentile 5.326
Q1 6.891
Median 7.866
Q3 8.572
95-th percentile 12.187
Maximum 14.313
Range 10.434
Interquartile range 1.681

Descriptive statistics

Standard deviation 1.8633
Coef of variation 0.23407
Kurtosis 2.7312
Mean 7.9603
MAD 1.283
Skewness 1.1837
Sum 3355800
Variance 3.4719
Memory size 6.4 MiB
Value Count Frequency (%)  
8.099 5152 1.2%
 
8.163 3636 0.9%
 
7.852 3614 0.9%
 
7.343 3416 0.8%
 
7.057 3414 0.8%
 
7.931 3400 0.8%
 
7.441 3397 0.8%
 
6.565 3370 0.8%
 
8.2 3361 0.8%
 
6.891 3360 0.8%
 
Other values (339) 385450 91.4%
 

Minimum 5 values

Value Count Frequency (%)  
3.8789999999999996 287 0.1%
 
4.077 938 0.2%
 
4.125 1831 0.4%
 
4.145 562 0.1%
 
4.156000000000001 1815 0.4%
 

Maximum 5 values

Value Count Frequency (%)  
13.975 1529 0.4%
 
14.020999999999999 2263 0.5%
 
14.099 2441 0.6%
 
14.18 2423 0.6%
 
14.312999999999999 2636 0.6%
 

Weekly_Sales
Numeric

Distinct count 359464
Unique (%) 85.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 15981
Minimum -4988.9
Maximum 693100
Zeros (%) 0.0%

Quantile statistics

Minimum -4988.9
5-th percentile 59.975
Q1 2079.7
Median 7612
Q3 20206
95-th percentile 61202
Maximum 693100
Range 698090
Interquartile range 18126

Descriptive statistics

Standard deviation 22711
Coef of variation 1.4211
Kurtosis 21.491
Mean 15981
MAD 15161
Skewness 3.262
Sum 6737200000
Variance 515800000
Memory size 6.4 MiB
Value Count Frequency (%)  
10.0 353 0.1%
 
5.0 289 0.1%
 
20.0 232 0.1%
 
15.0 215 0.1%
 
12.0 175 0.0%
 
1.0 169 0.0%
 
10.47 167 0.0%
 
11.97 154 0.0%
 
2.0 148 0.0%
 
7.0 146 0.0%
 
Other values (359454) 419522 99.5%
 

Minimum 5 values

Value Count Frequency (%)  
-4988.94 1 0.0%
 
-3924.0 1 0.0%
 
-1750.0 1 0.0%
 
-1699.0 1 0.0%
 
-1321.48 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
474330.1 1 0.0%
 
627962.93 1 0.0%
 
630999.19 1 0.0%
 
649770.18 1 0.0%
 
693099.36 1 0.0%
 

Correlations

Sample

Store Dept Date Weekly_Sales IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment
0 1 1 2010-02-05 24924.50 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
1 1 2 2010-02-05 50605.27 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
2 1 3 2010-02-05 13740.12 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
3 1 4 2010-02-05 39954.04 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
4 1 5 2010-02-05 32229.38 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
In [22]:
pandas_profiling.ProfileReport(test)
Out[22]:

Overview

Dataset info

Number of variables 15
Number of observations 115064
Total Missing (%) 7.4%
Total size in memory 13.3 MiB
Average record size in memory 121.0 B

Variables types

Numeric 12
Categorical 2
Boolean 1
Date 0
Text (Unique) 0
Rejected 0
Unsupported 0

Warnings

  • CPI has 38162 / 33.2% missing values Missing
  • MarkDown2 has 28627 / 24.9% missing values Missing
  • MarkDown3 has 9829 / 8.5% missing values Missing
  • MarkDown4 has 12888 / 11.2% missing values Missing
  • MarkDown5 is highly skewed (γ1 = 37.977) Skewed
  • Unemployment has 38162 / 33.2% missing values Missing

Variables

CPI
Numeric

Distinct count 361
Unique (%) 0.3%
Missing (%) 33.2%
Missing (n) 38162
Infinite (%) 0.0%
Infinite (n) 0
Mean 176.96
Minimum 131.24
Maximum 228.98
Zeros (%) 0.0%

Quantile statistics

Minimum 131.24
5-th percentile 131.48
Q1 138.4
Median 192.3
Q3 223.24
95-th percentile 227.78
Maximum 228.98
Range 97.74
Interquartile range 84.842

Descriptive statistics

Standard deviation 41.24
Coef of variation 0.23305
Kurtosis -1.8588
Mean 176.96
MAD 40.222
Skewness 0.071448
Sum 13609000
Variance 1700.7
Memory size 1.8 MiB
Value Count Frequency (%)  
132.71609679999997 2080 1.8%
 
139.1226129 1664 1.4%
 
201.0705712 825 0.7%
 
224.80253140000002 783 0.7%
 
131.537 704 0.6%
 
132.2725714 703 0.6%
 
131.2793548 702 0.6%
 
131.642 702 0.6%
 
131.4784 701 0.6%
 
132.65377420000002 698 0.6%
 
Other values (350) 67340 58.5%
 
(Missing) 38162 33.2%
 

Minimum 5 values

Value Count Frequency (%)  
131.2362258 695 0.6%
 
131.2793548 702 0.6%
 
131.3258 696 0.6%
 
131.37666670000002 695 0.6%
 
131.4275333 693 0.6%
 

Maximum 5 values

Value Count Frequency (%)  
228.72986380000003 401 0.3%
 
228.7796682 208 0.2%
 
228.8020401 60 0.1%
 
228.8892482 60 0.1%
 
228.9764563 186 0.2%
 

Date
Categorical

Distinct count 39
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
2012-12-21
 
3002
2012-12-07
 
2989
2012-12-28
 
2988
Other values (36)
106085
Value Count Frequency (%)  
2012-12-21 3002 2.6%
 
2012-12-07 2989 2.6%
 
2012-12-28 2988 2.6%
 
2012-12-14 2986 2.6%
 
2013-02-15 2984 2.6%
 
2012-11-23 2976 2.6%
 
2012-11-09 2971 2.6%
 
2013-01-04 2964 2.6%
 
2013-02-08 2964 2.6%
 
2012-11-30 2962 2.6%
 
Other values (29) 85278 74.1%
 

Dept
Numeric

Distinct count 81
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 44.34
Minimum 1
Maximum 99
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 4
Q1 18
Median 37
Q3 74
95-th percentile 95
Maximum 99
Range 98
Interquartile range 56

Descriptive statistics

Standard deviation 30.656
Coef of variation 0.6914
Kurtosis -1.2242
Mean 44.34
MAD 26.74
Skewness 0.36242
Sum 5101883
Variance 939.82
Memory size 1.8 MiB
Value Count Frequency (%)  
1 1755 1.5%
 
13 1755 1.5%
 
91 1755 1.5%
 
90 1755 1.5%
 
21 1755 1.5%
 
38 1755 1.5%
 
82 1755 1.5%
 
40 1755 1.5%
 
81 1755 1.5%
 
16 1755 1.5%
 
Other values (71) 97514 84.7%
 

Minimum 5 values

Value Count Frequency (%)  
1 1755 1.5%
 
2 1755 1.5%
 
3 1755 1.5%
 
4 1755 1.5%
 
5 1738 1.5%
 

Maximum 5 values

Value Count Frequency (%)  
95 1755 1.5%
 
96 1350 1.2%
 
97 1716 1.5%
 
98 1632 1.4%
 
99 613 0.5%
 

Fuel_Price
Numeric

Distinct count 297
Unique (%) 0.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.5815
Minimum 2.872
Maximum 4.125
Zeros (%) 0.0%

Quantile statistics

Minimum 2.872
5-th percentile 3.161
Q1 3.431
Median 3.606
Q3 3.766
95-th percentile 3.951
Maximum 4.125
Range 1.253
Interquartile range 0.335

Descriptive statistics

Standard deviation 0.23944
Coef of variation 0.066854
Kurtosis -0.1176
Mean 3.5815
MAD 0.18861
Skewness -0.39128
Sum 412110
Variance 0.057332
Memory size 1.8 MiB
Value Count Frequency (%)  
3.417 1853 1.6%
 
3.583 1851 1.6%
 
3.386 1793 1.6%
 
3.611 1374 1.2%
 
3.108 1201 1.0%
 
3.4789999999999996 1169 1.0%
 
3.597 1071 0.9%
 
3.451 1043 0.9%
 
3.227 1040 0.9%
 
3.614 1028 0.9%
 
Other values (287) 101641 88.3%
 

Minimum 5 values

Value Count Frequency (%)  
2.872 276 0.2%
 
2.889 276 0.2%
 
2.9139999999999997 193 0.2%
 
2.927 194 0.2%
 
2.957 279 0.2%
 

Maximum 5 values

Value Count Frequency (%)  
4.079 282 0.2%
 
4.099 355 0.3%
 
4.104 186 0.2%
 
4.109 189 0.2%
 
4.125 166 0.1%
 

IsHoliday
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.077592
True
 
8928
(Missing)
106136
Value Count Frequency (%)  
True 8928 7.8%
 
(Missing) 106136 92.2%
 

MarkDown1
Numeric

Distinct count 1753
Unique (%) 1.5%
Missing (%) 0.1%
Missing (n) 149
Infinite (%) 0.0%
Infinite (n) 0
Mean 7689.2
Minimum -2781.4
Maximum 103180
Zeros (%) 0.0%

Quantile statistics

Minimum -2781.4
5-th percentile 189.49
Q1 1966.5
Median 4842.3
Q3 9439.1
95-th percentile 23141
Maximum 103180
Range 105970
Interquartile range 7472.7

Descriptive statistics

Standard deviation 10699
Coef of variation 1.3914
Kurtosis 22.871
Mean 7689.2
MAD 6160.2
Skewness 4.1727
Sum 883610000
Variance 114460000
Memory size 1.8 MiB
Value Count Frequency (%)  
4655.55 74 0.1%
 
13357.31 74 0.1%
 
22673.11 74 0.1%
 
13613.52 74 0.1%
 
5692.66 74 0.1%
 
10755.57 74 0.1%
 
9753.88 74 0.1%
 
20297.6 74 0.1%
 
5813.45 73 0.1%
 
7701.72 73 0.1%
 
Other values (1742) 114177 99.2%
 
(Missing) 149 0.1%
 

Minimum 5 values

Value Count Frequency (%)  
-2781.45 50 0.0%
 
-772.21 43 0.0%
 
-563.9 70 0.1%
 
-16.93 44 0.0%
 
2.14 46 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
80498.65 71 0.1%
 
84139.36 72 0.1%
 
88750.34 66 0.1%
 
95102.5 71 0.1%
 
103184.98 72 0.1%
 

MarkDown2
Numeric

Distinct count 1258
Unique (%) 1.1%
Missing (%) 24.9%
Missing (n) 28627
Infinite (%) 0.0%
Infinite (n) 0
Mean 3734.1
Minimum -35.74
Maximum 71074
Zeros (%) 0.0%

Quantile statistics

Minimum -35.74
5-th percentile 6.14
Q1 180.35
Median 742.59
Q3 2735.7
95-th percentile 22672
Maximum 71074
Range 71110
Interquartile range 2555.3

Descriptive statistics

Standard deviation 8323.5
Coef of variation 2.2291
Kurtosis 15.881
Mean 3734.1
MAD 4697.7
Skewness 3.7406
Sum 322760000
Variance 69281000
Memory size 1.8 MiB
Value Count Frequency (%)  
0.01 346 0.3%
 
0.03 340 0.3%
 
82.92 217 0.2%
 
11.0 214 0.2%
 
3.0 209 0.2%
 
4.0 191 0.2%
 
104.92 141 0.1%
 
1.49 138 0.1%
 
0.06 138 0.1%
 
7.5 137 0.1%
 
Other values (1247) 84366 73.3%
 
(Missing) 28627 24.9%
 

Minimum 5 values

Value Count Frequency (%)  
-35.74 63 0.1%
 
-15.45 71 0.1%
 
-7.76 65 0.1%
 
-3.27 69 0.1%
 
-0.05 73 0.1%
 

Maximum 5 values

Value Count Frequency (%)  
52304.87 73 0.1%
 
52850.71 74 0.1%
 
56549.69 73 0.1%
 
59362.3 72 0.1%
 
71074.17 72 0.1%
 

MarkDown3
Numeric

Distinct count 1422
Unique (%) 1.2%
Missing (%) 8.5%
Missing (n) 9829
Infinite (%) 0.0%
Infinite (n) 0
Mean 2403.1
Minimum -179.26
Maximum 149480
Zeros (%) 0.0%

Quantile statistics

Minimum -179.26
5-th percentile 1.18
Q1 15.1
Median 78.26
Q3 272.58
95-th percentile 2361.6
Maximum 149480
Range 149660
Interquartile range 257.48

Descriptive statistics

Standard deviation 13768
Coef of variation 5.7293
Kurtosis 54.091
Mean 2403.1
MAD 4226.8
Skewness 7.1461
Sum 252890000
Variance 189560000
Memory size 1.8 MiB
Value Count Frequency (%)  
1.2 599 0.5%
 
1.0 498 0.4%
 
0.6 419 0.4%
 
0.8 348 0.3%
 
2.0 324 0.3%
 
0.4 278 0.2%
 
0.2 272 0.2%
 
5.0 271 0.2%
 
3.0 271 0.2%
 
0.1 269 0.2%
 
Other values (1411) 101686 88.4%
 
(Missing) 9829 8.5%
 

Minimum 5 values

Value Count Frequency (%)  
-179.26 62 0.1%
 
-89.1 66 0.1%
 
-44.54 67 0.1%
 
-23.97 72 0.1%
 
-17.44 69 0.1%
 

Maximum 5 values

Value Count Frequency (%)  
115048.81 73 0.1%
 
130129.11 70 0.1%
 
139621.51 72 0.1%
 
146394.44 72 0.1%
 
149483.31 73 0.1%
 

MarkDown4
Numeric

Distinct count 1484
Unique (%) 1.3%
Missing (%) 11.2%
Missing (n) 12888
Infinite (%) 0.0%
Infinite (n) 0
Mean 3356.2
Minimum 0.22
Maximum 65345
Zeros (%) 0.0%

Quantile statistics

Minimum 0.22
5-th percentile 16.96
Q1 155.46
Median 840.94
Q3 3096.9
95-th percentile 14191
Maximum 65345
Range 65344
Interquartile range 2941.5

Descriptive statistics

Standard deviation 7570.5
Coef of variation 2.2557
Kurtosis 25.452
Mean 3356.2
MAD 3897.5
Skewness 4.6686
Sum 342930000
Variance 57312000
Memory size 1.8 MiB
Value Count Frequency (%)  
3.0 171 0.1%
 
0.63 154 0.1%
 
358.15 145 0.1%
 
55.46 142 0.1%
 
2.61 141 0.1%
 
3.97 138 0.1%
 
4.88 137 0.1%
 
27.44 136 0.1%
 
970.77 134 0.1%
 
1.92 120 0.1%
 
Other values (1473) 100758 87.6%
 
(Missing) 12888 11.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.22 56 0.0%
 
0.63 154 0.1%
 
0.66 46 0.0%
 
0.78 54 0.0%
 
1.26 43 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
56735.25 72 0.1%
 
60065.82 72 0.1%
 
63130.81 70 0.1%
 
63830.91 71 0.1%
 
65344.64 72 0.1%
 

MarkDown5
Numeric

Distinct count 1754
Unique (%) 1.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3922.7
Minimum -185.17
Maximum 771450
Zeros (%) 0.0%

Quantile statistics

Minimum -185.17
5-th percentile 540.89
Q1 1309.3
Median 2390.4
Q3 4227.3
95-th percentile 9316.7
Maximum 771450
Range 771630
Interquartile range 2918

Descriptive statistics

Standard deviation 19445
Coef of variation 4.9571
Kurtosis 1494.9
Mean 3922.7
MAD 2983.7
Skewness 37.977
Sum 451360000
Variance 378110000
Memory size 1.8 MiB
Value Count Frequency (%)  
3113.78 137 0.1%
 
7968.28 74 0.1%
 
2105.14 74 0.1%
 
18831.34 74 0.1%
 
22677.91 74 0.1%
 
2167.73 74 0.1%
 
1947.25 74 0.1%
 
21807.99 74 0.1%
 
5449.98 74 0.1%
 
860.36 73 0.1%
 
Other values (1744) 114262 99.3%
 

Minimum 5 values

Value Count Frequency (%)  
-185.17 63 0.1%
 
-37.02 73 0.1%
 
40.98 44 0.0%
 
60.92 65 0.1%
 
114.25 51 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
35238.98 72 0.1%
 
43336.34 70 0.1%
 
45050.55 70 0.1%
 
45648.88 69 0.1%
 
771448.1 71 0.1%
 

Size
Numeric

Distinct count 40
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 136500
Minimum 34875
Maximum 219622
Zeros (%) 0.0%

Quantile statistics

Minimum 34875
5-th percentile 39690
Q1 93638
Median 140170
Q3 202500
95-th percentile 206300
Maximum 219622
Range 184747
Interquartile range 108870

Descriptive statistics

Standard deviation 61107
Coef of variation 0.44768
Kurtosis -1.2144
Mean 136500
MAD 52641
Skewness -0.32195
Sum 15705970078
Variance 3734100000
Memory size 1.8 MiB
Value Count Frequency (%)  
39910 5803 5.0%
 
39690 5702 5.0%
 
203819 5589 4.9%
 
219622 2836 2.5%
 
205863 2803 2.4%
 
202307 2797 2.4%
 
204184 2791 2.4%
 
202505 2788 2.4%
 
151315 2783 2.4%
 
126512 2782 2.4%
 
Other values (30) 78390 68.1%
 

Minimum 5 values

Value Count Frequency (%)  
34875 2447 2.1%
 
37392 2473 2.1%
 
39690 5702 5.0%
 
39910 5803 5.0%
 
41062 1863 1.6%
 

Maximum 5 values

Value Count Frequency (%)  
204184 2791 2.4%
 
205863 2803 2.4%
 
206302 2745 2.4%
 
207499 2756 2.4%
 
219622 2836 2.5%
 

Store
Numeric

Distinct count 45
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 22.238
Minimum 1
Maximum 45
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 3
Q1 11
Median 22
Q3 33
95-th percentile 43
Maximum 45
Range 44
Interquartile range 22

Descriptive statistics

Standard deviation 12.81
Coef of variation 0.57603
Kurtosis -1.1498
Mean 22.238
MAD 11.02
Skewness 0.076773
Sum 2558817
Variance 164.09
Memory size 1.8 MiB
Value Count Frequency (%)  
13 2836 2.5%
 
4 2803 2.4%
 
19 2799 2.4%
 
2 2797 2.4%
 
27 2791 2.4%
 
24 2790 2.4%
 
6 2788 2.4%
 
1 2783 2.4%
 
10 2782 2.4%
 
20 2774 2.4%
 
Other values (35) 87121 75.7%
 

Minimum 5 values

Value Count Frequency (%)  
1 2783 2.4%
 
2 2797 2.4%
 
3 2473 2.1%
 
4 2803 2.4%
 
5 2447 2.1%
 

Maximum 5 values

Value Count Frequency (%)  
41 2754 2.4%
 
42 1962 1.7%
 
43 1863 1.6%
 
44 2072 1.8%
 
45 2626 2.3%
 

Temperature
Numeric

Distinct count 1236
Unique (%) 1.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 53.942
Minimum -7.29
Maximum 101.95
Zeros (%) 0.0%

Quantile statistics

Minimum -7.29
5-th percentile 23.98
Q1 39.82
Median 54.47
Q3 67.35
95-th percentile 83.82
Maximum 101.95
Range 109.24
Interquartile range 27.53

Descriptive statistics

Standard deviation 18.724
Coef of variation 0.34712
Kurtosis -0.49597
Mean 53.942
MAD 15.417
Skewness -0.07357
Sum 6206800
Variance 350.59
Memory size 1.8 MiB
Value Count Frequency (%)  
57.25 312 0.3%
 
70.74 309 0.3%
 
70.18 309 0.3%
 
38.95 272 0.2%
 
70.01 263 0.2%
 
57.87 262 0.2%
 
85.0 261 0.2%
 
52.38 260 0.2%
 
79.15 260 0.2%
 
58.66 259 0.2%
 
Other values (1226) 112297 97.6%
 

Minimum 5 values

Value Count Frequency (%)  
-7.29 69 0.1%
 
-6.61 69 0.1%
 
-6.08 70 0.1%
 
0.25 68 0.1%
 
2.32 71 0.1%
 

Maximum 5 values

Value Count Frequency (%)  
94.1 45 0.0%
 
95.1 45 0.0%
 
95.51 45 0.0%
 
99.66 48 0.0%
 
101.95 187 0.2%
 

Type
Categorical

Distinct count 3
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
A
58713
B
44500
C
 
11851
Value Count Frequency (%)  
A 58713 51.0%
 
B 44500 38.7%
 
C 11851 10.3%
 

Unemployment
Numeric

Distinct count 90
Unique (%) 0.1%
Missing (%) 33.2%
Missing (n) 38162
Infinite (%) 0.0%
Infinite (n) 0
Mean 6.8687
Minimum 3.684
Maximum 10.199
Zeros (%) 0.0%

Quantile statistics

Minimum 3.684
5-th percentile 3.932
Q1 5.771
Median 6.806
Q3 8.036
95-th percentile 9.91
Maximum 10.199
Range 6.515
Interquartile range 2.265

Descriptive statistics

Standard deviation 1.5834
Coef of variation 0.23053
Kurtosis -0.60933
Mean 6.8687
MAD 1.3101
Skewness 0.1414
Sum 528220
Variance 2.5072
Memory size 1.8 MiB
Value Count Frequency (%)  
6.237 3377 2.9%
 
9.91 2454 2.1%
 
6.17 2336 2.0%
 
6.266 2147 1.9%
 
5.372000000000001 1871 1.6%
 
8.036 1823 1.6%
 
7.107 1808 1.6%
 
3.932 1808 1.6%
 
7.439 1805 1.6%
 
8.625 1783 1.5%
 
Other values (79) 55690 48.4%
 
(Missing) 38162 33.2%
 

Minimum 5 values

Value Count Frequency (%)  
3.6839999999999997 556 0.5%
 
3.8789999999999996 650 0.6%
 
3.8960000000000004 288 0.3%
 
3.9210000000000003 932 0.8%
 
3.932 1808 1.6%
 

Maximum 5 values

Value Count Frequency (%)  
8.951 847 0.7%
 
9.151 588 0.5%
 
9.874 751 0.7%
 
9.91 2454 2.1%
 
10.199 1731 1.5%
 

Correlations

Sample

Store Dept Date IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment
0 1 1 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573
1 1 2 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573
2 1 3 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573
3 1 4 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573
4 1 5 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573

Correlation matrix

In [14]:
# Correlation for train data
train_corr=pd.DataFrame(train.corr())
train_corr.head()
Out[14]:
Store Dept Weekly_Sales IsHoliday Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment
Store 1.000000 0.024258 -0.085117 -0.000522 -0.182763 -0.050230 0.065321 -0.119676 -0.034993 -0.031475 -0.009991 -0.026777 -0.211261 0.208759
Dept 0.024258 1.000000 0.148749 0.000663 -0.002491 0.004727 0.003544 -0.002512 0.000018 0.001855 0.004176 0.000295 -0.007178 0.007787
Weekly_Sales -0.085117 0.148749 1.000000 0.012843 0.244117 -0.002339 0.000089 0.085325 0.024565 0.060304 0.045325 0.090561 -0.021162 -0.025806
IsHoliday -0.000522 0.000663 0.012843 1.000000 0.000797 -0.155775 -0.078155 -0.035632 0.334327 0.428364 -0.000459 -0.053696 -0.001933 0.010555
Size -0.182763 -0.002491 0.244117 0.000797 1.000000 -0.058413 0.003632 0.345732 0.108843 0.048935 0.168266 0.304814 -0.003903 -0.068335
In [15]:
# Correlation for test data
test_corr=pd.DataFrame(test.corr())
test_corr.head()
Out[15]:
Store Dept IsHoliday Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment
Store 1.000000 0.019627 -0.001166 -0.186845 -0.043495 0.153425 -0.091707 -0.041370 -0.025177 0.010331 0.010419 -0.214872 0.250321
Dept 0.019627 1.000000 0.001249 0.001502 0.003970 0.000554 -0.002353 0.001292 0.000247 0.002510 0.000776 -0.006336 0.004087
IsHoliday -0.001166 0.001249 1.000000 -0.000443 -0.187428 -0.126443 0.355257 0.265402 0.496062 0.289700 -0.019386 -0.001475 0.010288
Size -0.186845 0.001502 -0.000443 1.000000 -0.061256 0.055088 0.309614 0.157526 0.050088 0.155448 0.103681 -0.002916 -0.001988
Temperature -0.043495 0.003970 -0.187428 -0.061256 1.000000 0.073938 -0.168899 -0.324280 -0.049771 -0.059583 0.003937 0.280861 0.022136
In [16]:
# visualize correlation matrix in Seaborn using a heatmap
sns.heatmap(train.corr())
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0xdb38b38>
In [17]:
# visualize correlation matrix in Seaborn using a heatmap
sns.heatmap(test.corr())
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x1870d4e0>

Exploratory Data Analysis:

In [18]:
# Share of records per store
train['Store'].value_counts(normalize=True).plot(kind='bar', figsize=(4, 5))
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x19c1e400>
In [19]:
# weekly sales plot
sns.distplot(train.Weekly_Sales)
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x19c1e1d0>
In [20]:
# Store wise sales
train.plot(kind='line', x='Weekly_Sales', y='Store', alpha=0.5)
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x573d198>

Sales Vs Type:

In [21]:
# Weekly sales Type wise
sns.barplot(x=train["Weekly_Sales"],y=train["Type"])
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x1249be80>

Department-wise Sales:

In [22]:
train.plot(kind='line', x='Dept', y='Weekly_Sales', alpha=0.5, figsize=(4, 5))
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x175ffd68>

Missing value Treatment

In [23]:
print (train.isnull().sum())
print ("*"*30)
print (test.isnull().sum())
Store                0
Dept                 0
Date                 0
Weekly_Sales         0
IsHoliday            0
Type                 0
Size                 0
Temperature          0
Fuel_Price           0
MarkDown1       270031
MarkDown2       309308
MarkDown3       283561
MarkDown4       285694
MarkDown5       269283
CPI                  0
Unemployment         0
dtype: int64
******************************
Store               0
Dept                0
Date                0
IsHoliday           0
Type                0
Size                0
Temperature         0
Fuel_Price          0
MarkDown1         149
MarkDown2       28627
MarkDown3        9829
MarkDown4       12888
MarkDown5           0
CPI             38162
Unemployment    38162
dtype: int64

Imputing CPI and Unemployment with the department-wise mean

In [24]:
test['CPI']=test.groupby(['Dept'])['CPI'].transform(lambda x: x.fillna(x.mean()))
test['Unemployment']=test.groupby(['Dept'])['Unemployment'].transform(lambda x: x.fillna(x.mean()))
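
A minimal safeguard, assuming a department could have no observed CPI or Unemployment at all (the group mean above would stay NaN in that case): fall back to the overall column mean. This fallback is an added sketch, not a step from the original run.

In [ ]:
# Sketch: fill any group that was entirely missing with the overall column mean
test['CPI'] = test['CPI'].fillna(test['CPI'].mean())
test['Unemployment'] = test['Unemployment'].fillna(test['Unemployment'].mean())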

For the remaining missing values (the MarkDown columns), impute with zero (no markdown)

In [25]:
train=train.fillna(0)
test=test.fillna(0)
In [26]:
# Recheck the missing values.

print (train.isnull().sum())
print ("*"*30)
print (test.isnull().sum())
Store           0
Dept            0
Date            0
Weekly_Sales    0
IsHoliday       0
Type            0
Size            0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
dtype: int64
******************************
Store           0
Dept            0
Date            0
IsHoliday       0
Type            0
Size            0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
dtype: int64

Outlier Treatment

In [27]:
# Cap extreme weekly sales at 100,000
train.Weekly_Sales = np.where(train.Weekly_Sales > 100000, 100000, train.Weekly_Sales)
In [28]:
train.Weekly_Sales.plot.hist(bins=25)
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x176834e0>
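
If a data-driven threshold is preferred over the fixed 100,000 ceiling, a possible sketch caps at a high quantile instead (the 99th percentile here is an assumed choice, not a value from the analysis above):

In [ ]:
# Sketch: quantile-based capping as an alternative to the fixed 100,000 ceiling
cap = train['Weekly_Sales'].quantile(0.99)   # assumed percentile choice
train['Weekly_Sales'] = np.where(train['Weekly_Sales'] > cap, cap, train['Weekly_Sales'])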

Feature Extraction

In this section, we select the appropriate features to train our regressor. We create new features from existing ones and convert categorical features into numeric form.
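
As an aside, the cells below map Type and IsHoliday to integer codes by hand; an equivalent route via one-hot encoding is sketched here (an alternative, not what this notebook actually uses; the Type_ prefix is an assumed naming choice):

In [ ]:
# Sketch: one-hot encode Type instead of the manual A/B/C -> 1/2/3 mapping used below
train_ohe = pd.get_dummies(train, columns=['Type'], prefix='Type')
test_ohe = pd.get_dummies(test, columns=['Type'], prefix='Type')
print(train_ohe.filter(like='Type_').head())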

In [29]:
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 420212 entries, 0 to 421569
Data columns (total 16 columns):
Store           420212 non-null int64
Dept            420212 non-null int64
Date            420212 non-null object
Weekly_Sales    420212 non-null float64
IsHoliday       420212 non-null bool
Type            420212 non-null object
Size            420212 non-null int64
Temperature     420212 non-null float64
Fuel_Price      420212 non-null float64
MarkDown1       420212 non-null float64
MarkDown2       420212 non-null float64
MarkDown3       420212 non-null float64
MarkDown4       420212 non-null float64
MarkDown5       420212 non-null float64
CPI             420212 non-null float64
Unemployment    420212 non-null float64
dtypes: bool(1), float64(10), int64(3), object(2)
memory usage: 71.7+ MB

Date Feature

In [30]:
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
In [31]:
# Extract date features
train['Date_dayofweek'] =train['Date'].dt.dayofweek
train['Date_month'] =train['Date'].dt.month 
train['Date_year'] =train['Date'].dt.year
train['Date_day'] =train['Date'].dt.day 

# For test data
test['Date_dayofweek'] =test['Date'].dt.dayofweek
test['Date_month'] =test['Date'].dt.month 
test['Date_year'] =test['Date'].dt.year
test['Date_day'] =test['Date'].dt.day

Type Feature Details

In [32]:
print (train.Type.value_counts())
print ("*"*30)
print (test.Type.value_counts())
A    214961
B    162787
C     42464
Name: Type, dtype: int64
******************************
A    58713
B    44500
C    11851
Name: Type, dtype: int64

IsHoliday Feature Details

In [33]:
print (train.IsHoliday.value_counts())
print ("*"*30)
print (test.IsHoliday.value_counts())
False    390652
True      29560
Name: IsHoliday, dtype: int64
******************************
False    106136
True       8928
Name: IsHoliday, dtype: int64
In [34]:
# Put train and test in a list so both can be transformed in one loop
train_test_data = [train, test]

Converting the categorical variable 'Type' into a numerical variable: A=1, B=2, C=3

In [35]:
type_mapping = {"A": 1, "B": 2, "C": 3}
for dataset in train_test_data:
    dataset['Type'] = dataset['Type'].map(type_mapping)

Converting Categorical Variable 'IsHoliday' into Numerical Variable

In [36]:
type_mapping = {False: 0, True: 1}
for dataset in train_test_data:
    dataset['IsHoliday'] = dataset['IsHoliday'].map(type_mapping)

Creating extra holiday variables: the flag is 1 (yes) if the week falls on that holiday, otherwise 0 (no).

Making new holiday variables based on the holiday dates given with the data....

In [37]:
# For Train Data Set
train['Super_Bowl'] = np.where((train['Date']==datetime(2010, 2, 12)) | (train['Date']==datetime(2011, 2, 11)) | (train['Date']==datetime(2012, 2, 10)) | (train['Date']==datetime(2013, 2, 8)),1,0)
train['Labour_Day'] = np.where((train['Date']==datetime(2010, 9, 10)) | (train['Date']==datetime(2011, 9, 9)) | (train['Date']==datetime(2012, 9, 7)) | (train['Date']==datetime(2013, 9, 6)),1,0)
train['Thanksgiving'] = np.where((train['Date']==datetime(2010, 11, 26)) | (train['Date']==datetime(2011, 11, 25)) | (train['Date']==datetime(2012, 11, 23)) | (train['Date']==datetime(2013, 11, 29)),1,0)
train['Christmas'] = np.where((train['Date']==datetime(2010, 12, 31)) | (train['Date']==datetime(2011, 12, 30)) | (train['Date']==datetime(2012, 12, 28)) | (train['Date']==datetime(2013, 12, 27)),1,0)

#For Test Data set........................................................................
test['Super_Bowl'] = np.where((test['Date']==datetime(2010, 2, 12)) | (test['Date']==datetime(2011, 2, 11)) | (test['Date']==datetime(2012, 2, 10)) | (test['Date']==datetime(2013, 2, 8)),1,0)
test['Labour_Day'] = np.where((test['Date']==datetime(2010, 9, 10)) | (test['Date']==datetime(2011, 9, 9)) | (test['Date']==datetime(2012, 9, 7)) | (test['Date']==datetime(2013, 9, 6)),1,0)
test['Thanksgiving'] = np.where((test['Date']==datetime(2010, 11, 26)) | (test['Date']==datetime(2011, 11, 25)) | (test['Date']==datetime(2012, 11, 23)) | (test['Date']==datetime(2013, 11, 29)),1,0)
test['Christmas'] = np.where((test['Date']==datetime(2010, 12, 31)) | (test['Date']==datetime(2011, 12, 30)) | (test['Date']==datetime(2012, 12, 28)) | (test['Date']==datetime(2013, 12, 27)),1,0)
In [38]:
# Update IsHoliday to include these new holiday flags
train['IsHoliday']=train['IsHoliday']|train['Super_Bowl']|train['Labour_Day']|train['Thanksgiving']|train['Christmas']
test['IsHoliday']=test['IsHoliday']|test['Super_Bowl']|test['Labour_Day']|test['Thanksgiving']|test['Christmas']
In [39]:
# Count of holiday for train data
print (train.Christmas.value_counts())
print (train.Super_Bowl.value_counts())
print (train.Thanksgiving.value_counts())
print (train.Labour_Day.value_counts())
0    414303
1      5909
Name: Christmas, dtype: int64
0    411339
1      8873
Name: Super_Bowl, dtype: int64
0    414266
1      5946
Name: Thanksgiving, dtype: int64
0    411380
1      8832
Name: Labour_Day, dtype: int64
In [40]:
# Count of holiday for Test data
print (test.Christmas.value_counts())
print (test.Super_Bowl.value_counts())
print (test.Thanksgiving.value_counts())
print (test.Labour_Day.value_counts())
0    112076
1      2988
Name: Christmas, dtype: int64
0    112100
1      2964
Name: Super_Bowl, dtype: int64
0    112088
1      2976
Name: Thanksgiving, dtype: int64
0    115064
Name: Labour_Day, dtype: int64
In [41]:
# IsHoliday now incorporates the extra holidays, so the individual flags are redundant.
# Drop the extra holiday variables.
dp=['Super_Bowl','Labour_Day','Thanksgiving','Christmas']

train.drop(dp,axis=1,inplace=True)
test.drop(dp,axis=1,inplace=True)
In [42]:
train.head(2)
Out[42]:
Store Dept Date Weekly_Sales IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment Date_dayofweek Date_month Date_year Date_day
0 1 1 2010-02-05 24924.50 0 1 151315 42.31 2.572 0.0 0.0 0.0 0.0 0.0 211.096358 8.106 4 2 2010 5
1 1 2 2010-02-05 50605.27 0 1 151315 42.31 2.572 0.0 0.0 0.0 0.0 0.0 211.096358 8.106 4 2 2010 5

Feature Selection

Dropping irrelevant variables (Unemployment, CPI and MarkDown5):

  -Since the MarkDown variables have already been imputed, most of them are kept.
  -MarkDown5 is removed because it is highly skewed.
In [43]:
features_drop=['Unemployment','CPI','MarkDown5']
train=train.drop(features_drop, axis=1)
test=test.drop(features_drop, axis=1)
In [44]:
train.head(2)
Out[44]:
Store Dept Date Weekly_Sales IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 Date_dayofweek Date_month Date_year Date_day
0 1 1 2010-02-05 24924.50 0 1 151315 42.31 2.572 0.0 0.0 0.0 0.0 4 2 2010 5
1 1 2 2010-02-05 50605.27 0 1 151315 42.31 2.572 0.0 0.0 0.0 0.0 4 2 2010 5
In [45]:
test.head(2)
Out[45]:
Store Dept Date IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 Date_dayofweek Date_month Date_year Date_day
0 1 1 2012-11-02 0 1 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 4 11 2012 2
1 1 2 2012-11-02 0 1 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 4 11 2012 2
In [46]:
# Convert all float columns to integer (this truncates decimal values)
for var in train:
    if train[var].dtypes == float:
        train[var]=train[var].astype(int)
        
for var in test:
    if test[var].dtypes == float:
        test[var]=test[var].astype(int)

First, check whether the target Y (Weekly_Sales) is normally distributed

In [47]:
import seaborn as sns
sns.distplot(train.Weekly_Sales)
Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0xc2e0f98>

As the figure above shows, Y is not normally distributed, so we take the log of Y.

In [48]:
train['Weekly_Sales']=np.log(train['Weekly_Sales']+1)
In [49]:
sns.distplot(train.Weekly_Sales)
Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0xc3a8588>
In [ ]:
## TODO: try a Box-Cox transformation and re-plot (a sketch follows the probability plot below).
In [50]:
# Check normality of the transformed target with a Q-Q plot
from scipy import stats
import pylab

stats.probplot(train.Weekly_Sales, dist="norm", plot=pylab )
pylab.show()
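
A minimal Box-Cox sketch for the to-do above, assuming we recover the raw positive sales by inverting the log1p applied earlier (the +1 shift guards against zeros introduced by the integer cast):

In [ ]:
# Sketch: Box-Cox transform of the recovered weekly sales
raw_sales = np.expm1(train['Weekly_Sales'])      # undo the log1p applied above
bc_sales, lam = stats.boxcox(raw_sales + 1)      # lambda fitted by maximum likelihood
print('fitted lambda:', lam)

stats.probplot(bc_sales, dist="norm", plot=pylab)
pylab.show()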

Model Building & Accuracy

Define training and testing set

In [51]:
#### train_X = everything except Weekly_Sales and Date
train_X=train.drop(['Weekly_Sales','Date'], axis=1)

#### train Y= Only Weekly_Sales 
train_y=train['Weekly_Sales'] 

#### Test_X
test_X=test.drop('Date',axis=1).copy()

train_X.shape, train_y.shape, test_X.shape
Out[51]:
((420212, 15), (420212,), (115064, 15))

Building models & comparing their scores

1. Linear Regression

In [52]:
# Method 1: scikit-learn LinearRegression
clf = LinearRegression()
clf.fit(train_X, train_y)
y_pred_linear = clf.predict(test_X)
acc_linear = round(clf.score(train_X, train_y) * 100, 2)
print('score: ' + str(acc_linear) + ' percent')
score: 11.03 percent
In [59]:
import statsmodels.api as sm
In [60]:
# Method 2: statsmodels OLS
train_x = sm.add_constant(train_X)       # design matrix with an intercept column
lm = sm.OLS(train_y, train_X).fit()      # fitted on train_X, i.e. without the intercept column
In [61]:
print(lm.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           Weekly_Sales   R-squared:                       0.110
Model:                            OLS   Adj. R-squared:                  0.110
Method:                 Least Squares   F-statistic:                     3720.
Date:                Tue, 03 Sep 2019   Prob (F-statistic):               0.00
Time:                        21:56:39   Log-Likelihood:            -8.6782e+05
No. Observations:              420212   AIC:                         1.736e+06
Df Residuals:                  420197   BIC:                         1.736e+06
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Store             -0.0134      0.000    -56.412      0.000      -0.014      -0.013
Dept               0.0016   9.65e-05     16.264      0.000       0.001       0.002
IsHoliday         -0.0520      0.012     -4.169      0.000      -0.076      -0.028
Type               0.1135      0.008     14.836      0.000       0.099       0.129
Size            1.084e-05   8.38e-08    129.421      0.000    1.07e-05     1.1e-05
Temperature       -0.0036      0.000    -21.059      0.000      -0.004      -0.003
Fuel_Price         0.0329      0.008      4.130      0.000       0.017       0.049
MarkDown1       1.071e-05   1.02e-06     10.539      0.000    8.72e-06    1.27e-05
MarkDown2      -8.538e-07   6.17e-07     -1.384      0.166   -2.06e-06    3.56e-07
MarkDown3       3.695e-06   5.59e-07      6.613      0.000     2.6e-06    4.79e-06
MarkDown4      -6.834e-06   1.43e-06     -4.794      0.000   -9.63e-06   -4.04e-06
Date_dayofweek    35.7844      3.199     11.187      0.000      29.515      42.054
Date_month         0.0160      0.001     16.088      0.000       0.014       0.018
Date_year         -0.0676      0.006    -10.621      0.000      -0.080      -0.055
Date_day          -0.0004      0.000     -1.049      0.294      -0.001       0.000
==============================================================================
Omnibus:                    79726.360   Durbin-Watson:                   1.429
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           156534.468
Skew:                          -1.156   Prob(JB):                         0.00
Kurtosis:                       4.896   Cond. No.                     1.63e+08
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.63e+08. This might indicate that there are
strong multicollinearity or other numerical problems.

2. Random Forest

In [53]:
clf = RandomForestRegressor(n_estimators=100)
clf.fit(train_X, train_y)
y_pred_rf=clf.predict(test_X)
acc_rf= round(clf.score(train_X, train_y) * 100, 2)
print ("Accuracy: %i %% \n"%acc_rf)
Accuracy: 99 % 

3. Decision Tree

In [54]:
clf=DecisionTreeRegressor()
clf.fit(train_X, train_y)
y_pred_dt= clf.predict(test_X)
acc_dt = round( clf.score(train_X, train_y) * 100, 2)
print (str(acc_dt) + ' percent')
100.0 percent

Comparing Models

Let's compare the training R² scores of the regression models used above. These are in-sample scores, so the tree-based models look near-perfect largely because they can memorize the training data; an out-of-sample RMSE comparison is sketched after the table.

In [55]:
models = pd.DataFrame({
    'Model': ['Linear Regression','Random Forest','Decision Tree'],
    
    'Score': [acc_linear, acc_rf,acc_dt]
    })

models.sort_values(by='Score', ascending=False)
Out[55]:
Model Score
2 Decision Tree 100.00
1 Random Forest 99.63
0 Linear Regression 11.03
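
The table above uses in-sample scores. A hedged sketch of an out-of-sample comparison with RMSE (on the log-sales scale), using the train_test_split and mean_squared_error already imported; the 80/20 split and random_state are assumed choices:

In [ ]:
# Sketch: compare models on a hold-out split with RMSE (log-sales scale)
X_tr, X_val, y_tr, y_val = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

for name, model in [('Linear Regression', LinearRegression()),
                    ('Decision Tree', DecisionTreeRegressor()),
                    ('Random Forest', RandomForestRegressor(n_estimators=100))]:
    model.fit(X_tr, y_tr)
    rmse = np.sqrt(mse(y_val, model.predict(X_val)))
    print('%s validation RMSE: %.4f' % (name, rmse))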

Predicting the sales value for the test data with the Random Forest model.

In [57]:
# Prediction value using Random Forest model..
submission = pd.DataFrame({
        "Store_Dept_Date": test.Store.astype(str)+'_'+test.Dept.astype(str)+'_'+test.Date.astype(str),
        "Weekly_Sales": y_pred_rf
    })

submission.to_csv('weekly_sales predicted.csv', index=False)
#submission.to_excel(writer,'Weekly_sales Pred',index=False)
In [58]:
submission.head()
Out[58]:
Store_Dept_Date Weekly_Sales
0 1_1_2012-11-02 10.268372
1 1_2_2012-11-02 10.766197
2 1_3_2012-11-02 9.306946
3 1_4_2012-11-02 10.562144
4 1_5_2012-11-02 10.341259
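
Note that the models were trained on log(Weekly_Sales + 1), so the predicted values above are on the log scale. If dollar-scale sales are wanted in the file, a sketch of the inverse transform (np.expm1 undoes the log1p; the output file name is an assumed choice):

In [ ]:
# Sketch: convert log-scale predictions back to the original sales scale
submission_dollars = submission.copy()
submission_dollars['Weekly_Sales'] = np.expm1(submission_dollars['Weekly_Sales'])
submission_dollars.to_csv('weekly_sales_predicted_dollars.csv', index=False)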
In [ ]:
##########################End##########

Happy Learning....